library(here)
## here() starts at /Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tmap)

Background

Data Wrangling: All Species

Reading In Data

I used data from two sources: the California Department of Fish and Wildlife and the California Protected Areas Database.

# Salmonid population monitoring estimates (CDFW CMP v2021): one row per
# population / brood year / metric / estimation method.
salmon_populations <- read_csv(here("data/Salmonid_Population_Monitoring_Data_CMPv2021.csv"))
## New names:
## Rows: 3918 Columns: 26
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): Watershed, Population, Species, Life Stage, Origin, Run designatio... dbl
## (4): ID, CDFW region, GEO_ID_POLY, GEO_ID_PT num (3): Value, X95 lower CI, X95
## upper CI lgl (1): ...26
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...26`
# CMP watershed polygons (157 multipolygons, NAD83 / California Albers).
# NOTE(review): st_read auto-selects the first layer of this geodatabase and
# warns about it; set the `layer` argument explicitly if more layers appear.
watershed <- st_read(here("data/ds3001/ds3001.gdb"))
## Multiple layers are present in data source /Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project/data/ds3001/ds3001.gdb, reading layer `ds3001'.
## Use `st_layers' to list all layer names and their type in a data source.
## Set the `layer' argument in `st_read' to read a particular layer.
## Warning in evalq((function (..., call. = TRUE, immediate. = FALSE, noBreaks. =
## FALSE, : automatically selected the first layer in a data source containing more
## than one.
## Reading layer `ds3001' from data source 
##   `/Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project/data/ds3001/ds3001.gdb' 
##   using driver `OpenFileGDB'
## Simple feature collection with 157 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -370211.5 ymin: -445468.7 xmax: 179047.7 ymax: 465217.3
## Projected CRS: NAD83 / California Albers
# CPAD protected-area holdings (94,534 multipolygons, same projected CRS as
# the watershed layer, so no reprojection is needed before intersecting).
protected_areas <- st_read(here("data/CPAD_2022a/CPAD_2022a_Holdings.dbf"))
## Reading layer `CPAD_2022a_Holdings' from data source 
##   `/Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project/data/CPAD_2022a/CPAD_2022a_Holdings.dbf' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 94534 features and 35 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -374984.2 ymin: -604454.8 xmax: 540016.3 ymax: 449743.2
## Projected CRS: NAD83 / California Albers

Filtering to Adult Salmon Data

In this section I refine the data. Because this study focuses on adult population counts, I select only the data on adults.

Not every watershed polygon provided by the CMP has count data for it. I am taking out polygons with no adult data associated. There are 157 polygons of watersheds, I have adult counts for 110 of them.

# Keep only adult counts and the columns needed downstream; rows without a
# watershed polygon id cannot be joined spatially, so drop them.
# (`== "Adult"` replaces `%in% "Adult"`: equality is the idiomatic test for a
# single value, and filter() drops NA conditions either way.)
spawning_data <- salmon_populations |>
  filter(`Life Stage` == "Adult") |> # all adult salmon
  select(Population, Watershed, Species, Brood_year = `Brood Year`,
         GEO_ID_POLY, Value, Metric, `Estimation method`) |> # relevant columns
  filter(!is.na(GEO_ID_POLY)) # drop rows with no matching spatial id

# Watershed polygon ids that have adult population data
watershed_id <- unique(spawning_data$GEO_ID_POLY)

# Restrict the watershed layer to polygons with spawning data, repairing any
# invalid geometries before later spatial operations.
watershed_new <- watershed |>
  filter(GEO_ID_POL %in% watershed_id) |>
  st_make_valid()

# Keep only the protected-area columns needed: unit name and year established
protected <- protected_areas |>
  select(UNIT_NAME, YR_EST) |>
  st_make_valid()

Filtering Protected Areas

protected2 <- protected |> filter(YR_EST < 2009| YR_EST %in% c(NA)) #remove after 2009

Calculating Percent Protected: Geospatial Wrangling

# Clip protected areas to the watersheds of interest; each output feature is
# the overlap of one protected-area polygon with one watershed polygon.
intersect_polygons <- st_intersection(protected2, watershed_new) |> 
   dplyr::select(Name, GEO_ID_POL) # keep watershed name and polygon id only
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
# Total protected area within each watershed
total_overlap <- intersect_polygons |> group_by(Name) |> # group by watershed name
  summarize(geometry = st_union(geometry))|> # union pieces so overlaps are not double-counted
  mutate(total_protected = st_area(geometry)) # protected area (units object, m^2 in this CRS)

# Drop geometry so the totals can be joined as a plain table
total_overlap_geomless <- total_overlap |> st_drop_geometry()

# Total area of each watershed (units object, m^2 in this CRS)
watershed_area <- watershed_new |> mutate(total_area = st_area(Shape))

# Attach the protected-area total to each watershed; watersheds with no
# overlapping protected land get NA here (converted to 0 below).
watershed_protected <- left_join(watershed_area, total_overlap_geomless,
                                 by = "Name")

# Percent of each watershed that is protected, rounded to a whole percent.
# Both areas are stripped to plain numerics before dividing (the original
# divided a units object by a numeric and stripped afterwards — same value,
# but harder to read); the three mutate() calls are collapsed into one.
watershed_final <- watershed_protected |>
  mutate(
    percent_protected = as.numeric(total_protected) / as.numeric(total_area) * 100,
    percent_protected = round(percent_protected, digits = 0),
    percent_protected = replace_na(percent_protected, 0) # NA -> 0 (no protected land)
  )

# Drop geometry and make it a plain data frame for the attribute join
watershed_geomless <- st_drop_geometry(watershed_final) |>
  as.data.frame() |>
  select(-Method_Typ)

#combine spawning observations with percent protected
all_data <- left_join(spawning_data, watershed_geomless, by = c("GEO_ID_POLY" = "GEO_ID_POL")) |> select("Population", "Species", "Value", "percent_protected", "Name", "Brood_year", "Metric", "Estimation method")
tmap_mode("view") # interactive (leaflet) map mode
## tmap mode set to interactive viewing
# check.and.fix repairs invalid geometries at draw time — needed because
# total_overlap triggers the invalid-after-reprojection warning below
tmap_options(check.and.fix = TRUE)

tm_shape(total_overlap) + tm_fill(col = "#004600") + # map protected portions
  tm_shape(watershed_new) + tm_borders(col = "blue") + tm_add_legend(labels = c( "Watershed Boundary", "Protected area"), col = c("blue", "#004600")) # map watershed boundaries
## Warning: The shape total_overlap is invalid (after reprojection). See
## sf::st_is_valid

Steelhead

Data Wrangling

steelhead <- all_data |> filter(Species == "Steelhead") #filter for steelhead


# One row per population-year: the MAX Value across estimation methods (the
# original comment said "mean", but the code takes max); percent_protected is
# averaged — it is constant within a population, so mean() just carries it.
steelhead_summary <- steelhead |>  group_by(Population, Brood_year) |> summarize(Value = max(Value), percent_protected = mean(percent_protected)) #max Value where estimate done multiple ways
## `summarise()` has grouped output by 'Population'. You can override using the
## `.groups` argument.
data_years <- steelhead_summary |> group_by(Population) |> summarize(min_year = min(Brood_year), max_year = max(Brood_year), total_year = length(Brood_year)) #find what years I have data over

# Populations with consistent data spanning the 2009-2018 study window
use_list <- data_years |> filter(min_year <= 2009) |> filter(max_year >= 2018)

# Descriptive name — the original `list` masked base::list()
keep_populations <- use_list$Population

# Keep those populations, restricted to brood years 2009 through 2018
steelhead_new <- steelhead_summary |>
  filter(Population %in% keep_populations) |>
  filter(Brood_year >= 2009) |>
  filter(Brood_year <= 2018)


# Drop populations whose counts are zero for the entire study window
zero_pops <- steelhead_new |>
  group_by(Population) |>
  summarize(max = max(Value)) |>
  filter(max == 0)

steelhead_usable <- steelhead_new |>
  filter(!Population %in% zero_pops$Population)

# Drop populations missing an observation at either end of the window, so
# every retained series runs from 2009 through 2018
incomplete_pops <- steelhead_usable |>
  group_by(Population) |>
  summarize(min_year = min(Brood_year),
            max_year = max(Brood_year),
            total_year = length(Brood_year)) |>
  filter(max_year != 2018 | min_year != 2009)

steelhead_final <- steelhead_usable |>
  filter(!Population %in% incomplete_pops$Population) |>
  mutate(year = as.numeric(Brood_year)) # numeric brood year for regression

Regression

\(population_t = \beta_0 + \beta_1 \, year_t + \beta_2 \, (year_t \times percentProtected) + \varepsilon_i\)

# Interaction model on the filtered series: does the time trend in adult
# counts differ with the share of the watershed that is protected?
summary(lm(Value~year + year:percent_protected, data = steelhead_final))
## 
## Call:
## lm(formula = Value ~ year + year:percent_protected, data = steelhead_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -574.7 -480.8 -422.3 -111.0 8701.4 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)            -2.533e+04  4.559e+04  -0.556    0.579
## year                    1.284e+01  2.264e+01   0.567    0.571
## year:percent_protected -1.286e-04  1.010e-03  -0.127    0.899
## 
## Residual standard error: 1192 on 328 degrees of freedom
## Multiple R-squared:  0.001025,   Adjusted R-squared:  -0.005066 
## F-statistic: 0.1683 on 2 and 328 DF,  p-value: 0.8452
# Sensitivity check: rerun the same regression on ALL steelhead data (no
# population/year filtering) to see whether the filtered result holds
steelhead_all <- steelhead_summary |> mutate(year = as.numeric(Brood_year))

summary(lm(Value~year + year:percent_protected, data = steelhead_all))
## 
## Call:
## lm(formula = Value ~ year + year:percent_protected, data = steelhead_all)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -541.5  -428.9  -343.3  -158.1 15784.8 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)            -2.060e+04  1.328e+04  -1.551    0.121
## year                    1.046e+01  6.597e+00   1.586    0.113
## year:percent_protected -1.236e-04  6.951e-04  -0.178    0.859
## 
## Residual standard error: 1302 on 846 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.003159,   Adjusted R-squared:  0.0008023 
## F-statistic:  1.34 on 2 and 846 DF,  p-value: 0.2623

Assumptions of OLS

Linear in Parameters

My data does not necessarily appear to be linear in parameters, which may cause it to violate the first assumption of OLS.

# Scatter of counts over time with an OLS fit line, to eyeball linearity
ggplot(data = steelhead_final, aes(x = year, y = Value)) + geom_point() + geom_smooth(method=lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

X has variation

This is true

Assumption 4

These do not appear to be normally distributed.

# Assumption 4 check: residuals should be roughly normally distributed
model <- lm(Value ~ year + year:percent_protected, data = steelhead_final)
# Put the residuals in a NAMED column so ggplot can map it from `data`
# (the original aes(x = model$residuals) bypassed the data argument), and
# use `resid_df` rather than `residuals` to avoid masking stats::residuals().
resid_df <- data.frame(residual = model$residuals)


ggplot(data = resid_df) + geom_histogram(aes(x = residual))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Coho

coho <- all_data |> filter(Species == "Coho salmon") # coho observations only

# One row per population-year: max Value across estimation methods;
# percent_protected is constant within a population, so max() just carries it.
coho2 <- coho |>
  group_by(Population, Brood_year) |>
  summarize(Value = max(Value), percent_protected = max(percent_protected))
## `summarise()` has grouped output by 'Population'. You can override using the
## `.groups` argument.
# Span of available brood years for each population
data_years <- coho2 |>
  group_by(Population) |>
  summarize(min_year = min(Brood_year),
            max_year = max(Brood_year),
            total_year = length(Brood_year))

# Populations with data covering the 2009-2018 study window
use_list <- data_years |> filter(min_year <= 2009) |> filter(max_year >= 2018)

# Descriptive name — the original `list` masked base::list()
keep_populations <- use_list$Population


coho_usable <- coho2 |>
  filter(Population %in% keep_populations) |>
  filter(Brood_year >= 2009) |>
  filter(Brood_year <= 2018)

# Drop populations whose counts are zero over this whole time period
coho_zero <- coho_usable |>
  group_by(Population) |>
  summarize(max = max(Value)) |>
  filter(max == 0)
remove <- coho_zero$Population


coho_usable2 <- coho_usable |> filter(!Population %in% remove)

# BUG FIX: the original computed remove_list and coho_final from
# `coho_usable`, silently discarding the all-zero-population filter above.
# Use `coho_usable2` so the filter takes effect, matching the steelhead
# workflow (which built its final set from steelhead_usable).
remove_list <- coho_usable2 |>
  group_by(Population) |>
  summarize(min_year = min(Brood_year),
            max_year = max(Brood_year),
            total_year = length(Brood_year)) |>
  filter(max_year != 2018 | min_year != 2009) # must span 2009-2018 exactly

remove <- remove_list$Population


coho_final <- coho_usable2 |> filter(!Population %in% remove)

# Quick look at the filtered coho series over time
ggplot(data = coho_final) + geom_point(aes(x = Brood_year, y = Value))

coho_final <- coho_final |>  mutate(year = as.numeric(Brood_year)) # numeric year for regression

# Same interaction model as for steelhead
summary(lm(Value~year + year:percent_protected, data = coho_final))
## 
## Call:
## lm(formula = Value ~ year + year:percent_protected, data = coho_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -552.6 -326.7 -201.8  -12.1 4632.9 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            -5.072e+04  3.604e+04  -1.407   0.1610  
## year                    2.544e+01  1.790e+01   1.421   0.1570  
## year:percent_protected -1.401e-03  7.219e-04  -1.940   0.0538 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 705.9 on 185 degrees of freedom
## Multiple R-squared:  0.03007,    Adjusted R-squared:  0.01958 
## F-statistic: 2.867 on 2 and 185 DF,  p-value: 0.05937
# Autocorrelation of the counts; serial correlation in the series would
# violate the OLS assumption of uncorrelated errors
acf(coho_final$Value, lag.max = 10)

Limitations and Issues

There were several limitations. Roughly 32% of the protected-area polygons I used had NA values for the year protected, and the definition of a protected area was very loose. The data were not consistent over time, so I only used data from 2009–2018. Some watersheds within the study have hatchery releases which are not consistent over time; this can wildly affect the population from year to year.